1@/*****************************************************************************
2@*
3@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4@*
5@* Licensed under the Apache License, Version 2.0 (the "License");
6@* you may not use this file except in compliance with the License.
7@* You may obtain a copy of the License at:
8@*
9@* http://www.apache.org/licenses/LICENSE-2.0
10@*
11@* Unless required by applicable law or agreed to in writing, software
12@* distributed under the License is distributed on an "AS IS" BASIS,
13@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@* See the License for the specific language governing permissions and
15@* limitations under the License.
16@*
17@*****************************************************************************/
18@/**
19@*******************************************************************************
20@* @file
21@*  ihevc_intra_pred_filters_planar.s
22@*
23@* @brief
24@*  contains function definitions for intra prediction (planar mode).
25@* functions are coded in neon assembly and can be compiled using
26
27@* rvct
28@*
29@* @author
30@*  akshaya mukund
31@*
32@* @par list of functions:
33@*
34@*
35@* @remarks
36@*  none
37@*
38@*******************************************************************************
39@*/
40@/**
41@*******************************************************************************
42@*
43@* @brief
44@*    luma intraprediction filter for planar input
45@*
46@* @par description:
47@*
48@* @param[in] pu1_ref
49@*  uword8 pointer to the source
50@*
51@* @param[out] pu1_dst
52@*  uword8 pointer to the destination
53@*
54@* @param[in] src_strd
55@*  integer source stride
56@*
57@* @param[in] dst_strd
58@*  integer destination stride
59@*
60@* @param[in] pi1_coeff
61@*  word8 pointer to the planar coefficients
62@*
63@* @param[in] nt
64@*  size of transform block
65@*
66@* @param[in] mode
67@*  type of filtering
68@*
69@* @returns
70@*
71@* @remarks
72@*  none
73@*
74@*******************************************************************************
75@*/
76
77@void ihevc_intra_pred_luma_planar(uword8* pu1_ref,
78@                                  word32 src_strd,
79@                                  uword8* pu1_dst,
80@                                  word32 dst_strd,
81@                                  word32 nt,
82@                                  word32 mode,
83@                  word32 pi1_coeff)
84@**************variables vs registers*****************************************
85@r0 => *pu1_ref
86@r1 => src_strd
87@r2 => *pu1_dst
88@r3 => dst_strd
89
90@stack contents from #104
91@   nt
92@   mode
93@   pi1_coeff
94
95.equ    nt_offset,      104
96
97.text
98.align 4
99
100
101
102
103.globl ihevc_intra_pred_luma_planar_a9q
104.extern gau1_ihevc_planar_factor
105.extern gau1_ihevc_planar_factor_1
106
107gau1_ihevc_planar_factor_addr:
108.long gau1_ihevc_planar_factor - ulbl1 - 8
109
110gau1_ihevc_planar_factor_1_addr:
111.long gau1_ihevc_planar_factor_1 - ulbl2 - 8
112
113
114.type ihevc_intra_pred_luma_planar_a9q, %function
115
@ Entry and common setup.
@ In:   r0 = pu1_ref, r2 = pu1_dst, r3 = dst_strd, [sp] = nt
@       (r1 = src_strd is overwritten below without being read)
@ Per output pixel the routine accumulates
@       (row+1)*src[nt-1] + (col+1)*src[3nt+1]
@     + (nt-1-row)*src[2nt+1+col] + (nt-1-col)*src[2nt-1-row] + nt
@ and shifts right by (32 - clz(nt)) bits.
@ Register state handed to the size-specific paths:
@       r0, r14 = &src[2nt+1]     r6  = &src[2nt-1]
@       r1  = nt (loop counter)   r4  = nt
@       r11 = &gau1_ihevc_planar_factor, r12 = r11 + 1
@       d0  = src[nt-1] x8        d1  = src[3nt+1] x8
@       d2  = nt x8               d5  = row+1 (starts at 1)
@       d6  = nt-1-row            d7  = 1 (step for d5/d6)
@       q7  = -(32 - clz(nt)) per 16-bit lane (vshl by this = right shift)
116ihevc_intra_pred_luma_planar_a9q:
117
118    stmfd       sp!, {r4-r12, r14}          @save r4-r12 and lr (40 bytes)
119    vpush       {d8 - d15}                  @save d8-d15 (64 bytes)
120    ldr         r4,[sp,#nt_offset]          @loads nt
121    ldr         r11, gau1_ihevc_planar_factor_addr @pc-relative offset of the coeff table
122ulbl1:
123    add         r11,r11,pc                  @r11 = &gau1_ihevc_planar_factor
124
125    clz         r5, r4
126    rsb         r5, r5, #32                 @32 - clz(nt): the right-shift amount
127    vdup.16     q7, r5
128    vneg.s16    q7, q7                      @shr value (so vneg)
129    vdup.8      d2, r4                      @nt
130    vdup.s16    q8, r4                      @nt
131
132    sub         r6, r4, #1                  @nt-1
133    add         r6, r6, r0
134    ldrb        r7, [r6]                    @byte load: only src[nt-1] is needed and the address may be unaligned
135    vdup.s8     d0, r7                      @src[nt-1]
136
137    add         r6, r4, r4,lsl #1           @3nt
138    add         r6, r6, #1                  @3nt + 1
139    add         r6, r6, r0
140    ldrb        r7, [r6]                    @byte load: only src[3nt+1] is needed and the address may be unaligned
141    vdup.s8     d1, r7                      @src[3nt+1]
142
143    add         r6, r4, r4                  @2nt
144    add         r14, r6, #1                 @2nt+1
145    sub         r6, r6, #1                  @2nt-1
146    add         r6, r6, r0                  @&src[2nt-1]
147    add         r14, r14, r0                @&src[2nt+1]
148
149    mov         r8, #1                      @row+1 (row is first 0)
150    sub         r9, r4, r8                  @nt-1-row (row is first 0)
151
152    vdup.s8     d5, r8                      @row + 1
153    vdup.s8     d6, r9                      @nt - 1 - row
154    vmov        d7, d5                      @d7 = 1, used to inc row+1 and dec nt-1-row
155
156    add         r12, r11, #1                @coeffs (to be reloaded after every row)
157    mov         r1, r4                      @nt (loop counter, decremented by the size-specific paths)
158    mov         r5, r2                      @dst copy (r5 is overwritten in the 8/16/32 path)
159    mov         r10, #8                     @increment for the coeffs
160    mov         r0, r14                     @&src[2nt+1] (kept to reset r14)
161
162    cmp         r4, #4
163    beq         tf_sz_4                     @nt == 4 uses the dedicated 4x4 loop
164
165@@ ========== ***************** =====================
@ Prolog for nt = 8/16/32: sets up the tile loop and computes the first
@ 8x8 tile. Rows are numbered (1)..(8) in the comments; "(1n)" marks work
@ done for row 1 of the NEXT tile. Rows (1)-(6) are stored here; rows (7)
@ and (8) are left in q13/q12 and stored by kernel_plnr or epilog.
@ Each row accumulator is seeded with nt (vdup.16), receives four vmlal
@ products, is shifted right via vshl by q7 (negative counts) and narrowed.
166prolog:
167tf_sz_8_16_32:
168
169    mov         r7, r4                      @nt
170    mov         r9, r4, lsr #3              @divide nt by 8
171    mul         r7, r7, r9                  @nt * (nt / 8): 8-pixel row segments to produce (8 per tile)
172    ldr         r5, gau1_ihevc_planar_factor_1_addr @pc-relative offset of the second table
173ulbl2:
174    add         r5,r5,pc                    @r5 = &gau1_ihevc_planar_factor_1
175    sub         r6, r6, #7                  @&src[2nt-8]: an 8-byte load puts src[2nt-1] in lane 7
176    mov         r8, r2                      @copy of dst (r8 is not read again in this function)
177    lsl         r9, r3, #3                  @8*stride
178    rsb         r9, r9, #8                  @8-8*stride (step from tile bottom to the next tile on the right)
179    mov         r10, r4                     @nt
180    sub         r10, r10, #8                @nt - 8
181
182col_loop_8_16_32:
183
184    vld1.s8     d8, [r12]                   @(1-8)load 8 coeffs [col+1]
185    vdup.16     q6, r4                      @(1)seed accumulator with nt
186    vld1.s8     d4, [r6]                    @(1-8)src[2nt-1-row]
187    vsub.s8     d9, d2, d8                  @(1-8)[nt-1-col]
188
189
190    vmlal.u8    q6, d5, d0                  @(1)(row+1) *   src[nt-1]
191
192    vld1.s8     d3, [r14]                   @(1-8)load 8 src[2nt+1+col]
193    vmlal.u8    q6, d8, d1                  @(1)(col+1) *   src[3nt+1]
194
195    vdup.s8     d20, d4[7]                  @(1)left sample for row 1 (highest lane = first row)
196    vmlal.u8    q6, d6, d3                  @(1)(nt-1-row)  *   src[2nt+1+col]
197
198    vdup.s8     d21, d4[6]                  @(2)
199    vmlal.u8    q6, d9, d20                 @(1)(nt-1-col)  *   src[2nt-1-row]
200
201    vdup.16     q15, r4                     @(2)
202    vadd.s8     d5, d5, d7                  @(1)row+1 incremented for the next row
203
204    vsub.s8     d6, d6, d7                  @(1)nt-1-row decremented for the next row
205
206    vdup.s8     d22, d4[5]                  @(3)
207    vmlal.u8    q15, d5, d0                 @(2)
208
209    vdup.16     q14, r4                     @(3)
210    vmlal.u8    q15, d8, d1                 @(2)
211
212    vmlal.u8    q15, d6, d3                 @(2)
213    vmlal.u8    q15, d9, d21                @(2)
214
215    vshl.s16    q6, q6, q7                  @(1)shr
216
217    vadd.s8     d5, d5, d7                  @(2)
218    vsub.s8     d6, d6, d7                  @(2)
219
220    vmovn.i16   d12, q6                     @(1)
221    vmlal.u8    q14, d5, d0                 @(3)
222
223    vdup.8      d23, d4[4]                  @(4)
224    vmlal.u8    q14, d8, d1                 @(3)
225
226    vdup.16     q5, r4                      @(4)
227    vmlal.u8    q14, d6, d3                 @(3)
228
229    vst1.s8     d12, [r2], r3               @(1)str 8 values
230    vmlal.u8    q14, d9, d22                @(3)
231
232    vshl.s16    q15, q15, q7                @(2)shr
233
234    vadd.s8     d5, d5, d7                  @(3)
235    vsub.s8     d6, d6, d7                  @(3)
236
237    vmovn.i16   d30, q15                    @(2)
238    vmlal.u8    q5, d5, d0                  @(4)
239
240    vdup.8      d20, d4[3]                  @(5)
241    vmlal.u8    q5, d8, d1                  @(4)
242
243    vdup.16     q8, r4                      @(5)
244    vmlal.u8    q5, d6, d3                  @(4)
245
246    vst1.s8     d30, [r2], r3               @(2)str 8 values
247    vmlal.u8    q5, d9, d23                 @(4)
248
249    vshl.s16    q14, q14, q7                @(3)shr
250
251    vadd.s8     d5, d5, d7                  @(4)
252    vsub.s8     d6, d6, d7                  @(4)
253
254    vmovn.i16   d28, q14                    @(3)
255    vmlal.u8    q8, d5, d0                  @(5)
256
257    vdup.8      d21, d4[2]                  @(6)
258    vmlal.u8    q8, d8, d1                  @(5)
259
260    vdup.16     q9, r4                      @(6)
261    vmlal.u8    q8, d6, d3                  @(5)
262
263    vst1.s8     d28, [r2], r3               @(3)str 8 values
264    vmlal.u8    q8, d9, d20                 @(5)
265
266    vshl.s16    q5, q5, q7                  @(4)shr
267    vadd.s8     d5, d5, d7                  @(5)
268    vsub.s8     d6, d6, d7                  @(5)
269
270    vmovn.i16   d10, q5                     @(4)
271    vmlal.u8    q9, d5, d0                  @(6)
272
273    vdup.8      d22, d4[1]                  @(7)
274    vmlal.u8    q9, d8, d1                  @(6)
275
276    vdup.16     q13, r4                     @(7)
277    vmlal.u8    q9, d6, d3                  @(6)
278
279    vst1.s8     d10, [r2], r3               @(4)str 8 values
280    vmlal.u8    q9, d9, d21                 @(6)
281
282    vshl.s16    q8, q8, q7                  @(5)shr
283
284    vadd.s8     d5, d5, d7                  @(6)
285    vsub.s8     d6, d6, d7                  @(6)
286
287    vmovn.i16   d16, q8                     @(5)
288    vmlal.u8    q13, d5, d0                 @(7)
289
290    vdup.8      d23, d4[0]                  @(8)
291    vmlal.u8    q13, d8, d1                 @(7)
292
293    vdup.16     q12, r4                     @(8)
294    vmlal.u8    q13, d6, d3                 @(7)
295
296    vst1.s8     d16, [r2], r3               @(5)str 8 values
297    vmlal.u8    q13, d9, d22                @(7)
298
299    vshl.s16    q9, q9, q7                  @(6)shr
300
301    vadd.s8     d5, d5, d7                  @(7)
302    vsub.s8     d6, d6, d7                  @(7)
303
304    vmovn.i16   d18, q9                     @(6)
305    vmlal.u8    q12, d5, d0                 @(8)
306
307
308    vmlal.u8    q12, d8, d1                 @(8)
309
310    vmlal.u8    q12, d6, d3                 @(8)
311
312    vst1.s8     d18, [r2], r3               @(6)str 8 values
313    vmlal.u8    q12, d9, d23                @(8)
314
315    vshl.s16    q13, q13, q7                @(7)shr
316
317    subs        r7, r7, #8                  @one tile (8 row segments) accounted for
318
319    beq         epilog                      @single-tile case: only rows (7),(8) remain to be stored
320
321    subs        r1, r1, #8                  @columns left in the current band of 8 rows
322    addgt       r12, r12, #8                @col inc
323    addgt       r14, r14, #8                @also for col inc
324    movle       r1, r4                      @nt reloaded (refresh the value)
325    addle       r12, r11, #1                @r12 reset
326
327    movle       r14, r0                     @r14 reset
328    vld1.s8     d8, [r12]                   @(1n)(1-8)load 8 coeffs [col+1]
329
330    suble       r6, r6, #8                  @for next set of rows
331    vld1.s8     d3, [r14]                   @(1n)(1-8)load 8 src[2nt+1+col]
332
333    addle       r5, r5, #8                  @next 8 entries of gau1_ihevc_planar_factor_1
334    vdup.16     q6, r4                      @(1n)(1)
335
336    vld1.s8     d5, [r5]                    @(row+1) values for the next tile, read from gau1_ihevc_planar_factor_1
337
338    vld1.s8     d4, [r6]                    @(1n)(1-8)src[2nt-1-row]
339    vsub.s8     d9, d2, d8                  @(1n)(1-8)[nt-1-col]
340
341    vdup.s8     d20, d4[7]                  @(1n)(1)
342    vsub.s8     d6, d2, d5                  @(nt-1-row) = nt - (row+1)
343
344    beq         epilog                      @tests Z from the subs on r1 above (NEON ops leave the flags alone)
345
@ Steady-state loop: one 8x8 tile per iteration, software pipelined.
@ Each pass finishes and stores rows (7),(8) of the previous tile, computes
@ and stores rows (1)-(6) of the current tile, leaves rows (7),(8) in
@ q13/q12, and preloads the inputs tagged "(1n)" for the next tile.
@ Integer bookkeeping is interleaved with the NEON work:
@   r1  = columns left in the current band of 8 rows
@   r7  = 8-pixel row segments left overall (loop ends when it reaches 0)
@   r2  = dst write pointer, r9 = 8 - 8*dst_strd, r10 = nt - 8
@   r12 = coeff pointer, r14 = &src[2nt+1+col], r6 = left-column pointer,
@   r5  = pointer into gau1_ihevc_planar_factor_1 (source of the row+1 values)
346kernel_plnr:
347
348    cmp         r1, #0                      @ (cond loop) flags drive addgt/suble/movle below
349    vshl.s16    q12, q12, q7                @(8)shr
350
351    vmovn.i16   d26, q13                    @(7)
352    vmlal.u8    q6, d5, d0                  @(1)(row+1) *   src[nt-1]
353
354    vmovn.i16   d24, q12                    @(8)
355    vmlal.u8    q6, d8, d1                  @(1)(col+1) *   src[3nt+1]
356
357    vdup.s8     d21, d4[6]                  @(2)
358    vmlal.u8    q6, d6, d3                  @(1)(nt-1-row)  *   src[2nt+1+col]
359
360    vdup.16     q15, r4                     @(2)
361    vmlal.u8    q6, d9, d20                 @(1)(nt-1-col)  *   src[2nt-1-row]
362
363    vst1.s8     d26, [r2], r3               @(7)str 8 values
364    vadd.s8     d5, d5, d7                  @(1)
365
366    vst1.s8     d24, [r2], r3               @(8)str 8 values
367    vsub.s8     d6, d6, d7                  @(1)
368
369    addgt       r2, r2, r9                  @since more cols to fill, dst + 8 - 8*strd (cond loop)
370    vmlal.u8    q15, d5, d0                 @(2)
371
372    suble       r2, r2, r10                 @else go to next set of rows, dst - (nt-8) (cond loop)
373    vmlal.u8    q15, d8, d1                 @(2)
374
375    vdup.s8     d22, d4[5]                  @(3)
376    vmlal.u8    q15, d6, d3                 @(2)
377
378    vdup.16     q14, r4                     @(3)
379    vmlal.u8    q15, d9, d21                @(2)
380
381    vshl.s16    q6, q6, q7                  @(1)shr
382
383    vadd.s8     d5, d5, d7                  @(2)
384    movle       r1, r4                      @nt reloaded (refresh the value)    (cond loop)
385
386    vsub.s8     d6, d6, d7                  @(2)
387    subs        r1, r1, #8                  @columns left after this tile; new flags drive the pointer updates below
388
389    vmovn.i16   d12, q6                     @(1)
390    vmlal.u8    q14, d5, d0                 @(3)
391
392    vdup.8      d23, d4[4]                  @(4)
393    vmlal.u8    q14, d8, d1                 @(3)
394
395    vdup.16     q5, r4                      @(4)
396    vmlal.u8    q14, d6, d3                 @(3)
397
398    vst1.s8     d12, [r2], r3               @(1)str 8 values
399    vmlal.u8    q14, d9, d22                @(3)
400
401    vshl.s16    q15, q15, q7                @(2)shr
402
403    vadd.s8     d5, d5, d7                  @(3)
404
405    vsub.s8     d6, d6, d7                  @(3)
406
407    vmovn.i16   d30, q15                    @(2)
408    vmlal.u8    q5, d5, d0                  @(4)
409
410    vdup.8      d20, d4[3]                  @(5)
411    vmlal.u8    q5, d8, d1                  @(4)
412
413    vdup.16     q8, r4                      @(5)
414    vmlal.u8    q5, d6, d3                  @(4)
415
416    vst1.s8     d30, [r2], r3               @(2)str 8 values
417    vmlal.u8    q5, d9, d23                 @(4)
418
419    vshl.s16    q14, q14, q7                @(3)shr
420
421    vadd.s8     d5, d5, d7                  @(4)
422
423    vsub.s8     d6, d6, d7                  @(4)
424
425    vmovn.i16   d28, q14                    @(3)
426    vmlal.u8    q8, d5, d0                  @(5)
427
428    vdup.8      d21, d4[2]                  @(6)
429    vmlal.u8    q8, d8, d1                  @(5)
430
431    vdup.16     q9, r4                      @(6)
432    vmlal.u8    q8, d6, d3                  @(5)
433
434    vst1.s8     d28, [r2], r3               @(3)str 8 values
435    vmlal.u8    q8, d9, d20                 @(5)
436
437    addle       r12, r11, #1                @r12 reset (cond loop)
438    vshl.s16    q5, q5, q7                  @(4)shr
439
440    addgt       r12, r12, #8                @col inc (cond loop)
441    vadd.s8     d5, d5, d7                  @(5)
442
443    addgt       r14, r14, #8                @also for col inc (cond loop)
444    vsub.s8     d6, d6, d7                  @(5)
445
446    vmovn.i16   d10, q5                     @(4)
447    vmlal.u8    q9, d5, d0                  @(6)
448
449    vdup.8      d22, d4[1]                  @(7)
450    vmlal.u8    q9, d8, d1                  @(6)
451
452    vdup.16     q13, r4                     @(7)
453    vmlal.u8    q9, d6, d3                  @(6)
454
455    vst1.s8     d10, [r2], r3               @(4)str 8 values
456    vmlal.u8    q9, d9, d21                 @(6)
457
458    movle       r14, r0                     @r14 reset (cond loop)
459    vshl.s16    q8, q8, q7                  @(5)shr
460
461    suble       r6, r6, #8                  @for next set of rows (cond loop)
462    vadd.s8     d5, d5, d7                  @(6)
463
464    addle       r5, r5, #8                  @next 8 entries of gau1_ihevc_planar_factor_1 (cond loop)
465    vsub.s8     d6, d6, d7                  @(6)
466
467    vmovn.i16   d16, q8                     @(5)
468    vmlal.u8    q13, d5, d0                 @(7)
469
470    vdup.8      d23, d4[0]                  @(8)
471    vmlal.u8    q13, d8, d1                 @(7)
472
473    vdup.16     q12, r4                     @(8)
474    vmlal.u8    q13, d6, d3                 @(7)
475
476    vst1.s8     d16, [r2], r3               @(5)str 8 values
477    vmlal.u8    q13, d9, d22                @(7)
478
479    vld1.s8     d4, [r6]                    @(1n)(1-8)src[2nt-1-row]
480    vshl.s16    q9, q9, q7                  @(6)shr
481
482    vadd.s8     d5, d5, d7                  @(7)
483
484    vsub.s8     d6, d6, d7                  @(7)
485
486    vmovn.i16   d18, q9                     @(6)
487    vmlal.u8    q12, d5, d0                 @(8)
488
489    vld1.s8     d5, [r5]                    @(row+1 value) for the next tile
490    vmlal.u8    q12, d8, d1                 @(8)
491
492    vdup.s8     d20, d4[7]                  @(1n)(1)
493    vmlal.u8    q12, d6, d3                 @(8)
494
495    vst1.s8     d18, [r2], r3               @(6)str 8 values
496    vmlal.u8    q12, d9, d23                @(8)
497
498    vld1.s8     d8, [r12]                   @(1n)(1-8)load 8 coeffs [col+1]
499    vsub.s8     d6, d2, d5                  @(nt-1-row) value
500
501    subs        r7, r7, #8                  @row segments left; Z set when this was the last tile
502
503    vld1.s8     d3, [r14]                   @(1n)(1-8)load 8 src[2nt+1+col]
504    vshl.s16    q13, q13, q7                @(7)shr
505
506    vdup.16     q6, r4                      @(1n)(1)
507    vsub.s8     d9, d2, d8                  @(1n)(1-8)[nt-1-col]
508
509    bne         kernel_plnr                 @NEON ops above leave the flags from subs r7 intact
510
@ Drains the pipeline: narrows and stores rows (7) and (8) of the last
@ tile. q13 has already been shifted; q12 is shifted here.
511epilog:
512
513    vmovn.i16   d26, q13                    @(7)
514    vst1.s8     d26, [r2], r3               @(7)str 8 values
515
516    vshl.s16    q12, q12, q7                @(8)shr
517    vmovn.i16   d24, q12                    @(8)
518    vst1.s8     d24, [r2], r3               @(8)str 8 values
519
520@@ ========== ***************** =====================
521
@ Every path into epilog arrives with Z set (a taken beq, or the fall-through
@ of "bne kernel_plnr") and nothing above changes the flags, so this branch
@ is always taken and skips the 4x4 code that follows.
522    beq         end_loop
523
@ 4x4 path (nt == 4): one row of 4 pixels per iteration, r1 = nt rows.
@ Uses the state prepared at function entry: d0 = src[nt-1], d1 = src[3nt+1],
@ d2 = nt, d5 = row+1, d6 = nt-1-row, d7 = 1, r6 = &src[2nt-1],
@ r12 = coeff pointer, r14 = &src[2nt+1], r2 = dst, r3 = dst_strd.
@ 8 lanes are computed per row but only the low 4 bytes are stored.
524tf_sz_4:
525    vld1.s8     d10, [r14]                  @load src[2nt+1+col]
526    vld1.s8     d8, [r12], r10              @load 8 coeffs [col+1]
527loop_sz_4:
528    mov         r10, #4                     @reduce inc to #4 for 4x4 (r10 is not read again before it is restored)
529    ldrb        r7, [r6], #-1               @src[2nt-1-row] as a byte (address may be unaligned); step back one for the next row
530    vdup.s8     d4, r7                      @src[2nt-1-row]
531
532    vsub.s8     d9, d2, d8                  @[nt-1-col]
533
534    vmull.u8    q6, d5, d0                  @(row+1)    *   src[nt-1]
535    vmlal.u8    q6, d6, d10                 @(nt-1-row) *   src[2nt+1+col]
536    vmlal.u8    q6, d8, d1                  @(col+1)    *   src[3nt+1]
537    vmlal.u8    q6, d9, d4                  @(nt-1-col) *   src[2nt-1-row]
538@   vadd.i16    q6, q6, q8          @add (nt)
539@   vshl.s16    q6, q6, q7          @shr
540@   vmovn.i16   d12, q6
541    vrshrn.s16  d12,q6,#3                   @rounding narrow: (sum + 4) >> 3, replaces the three lines above for nt = 4
542    vst1.s32    {d12[0]}, [r2], r3          @store 4 pixels, advance dst by dst_strd
543
544    vadd.s8     d5, d5, d7                  @row++ [(row+1)++]
545    vsub.s8     d6, d6, d7                  @[nt-1-row]--
546    subs        r1, r1, #1
547
548    bne         loop_sz_4
549
@ Common exit: restore the registers saved in the prologue; loading the
@ saved lr into r15 (pc) returns to the caller.
550end_loop:
551    vpop        {d8 - d15}
552    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
553
554
555
556
557
558
559
560
561